In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
db_2009 = pd.read_json('data-sample_data-nyctaxi-trips-2009-json_corrigido.json', lines=True)
In [3]:
db_2010 = pd.read_json('data-sample_data-nyctaxi-trips-2010-json_corrigido.json', lines=True)
In [4]:
db_2011 = pd.read_json('data-sample_data-nyctaxi-trips-2011-json_corrigido.json', lines=True)
In [5]:
db_2012 = pd.read_json('data-sample_data-nyctaxi-trips-2012-json_corrigido.json', lines=True)
In [6]:
print(db_2009['vendor_id'].unique())
print(db_2010['vendor_id'].unique())
print(db_2011['vendor_id'].unique())
print(db_2012['vendor_id'].unique())
['CMT' 'VTS' 'DDS' 'TS']
['CMT' 'VTS' 'DDS' 'TS']
['CMT' 'VTS' 'DDS' 'TS']
['CMT' 'VTS' 'DDS' 'TS']
In [7]:
db_pay = pd.read_csv('data-payment_lookup-csv.csv', header=1)
db_vendor = pd.read_csv('data-vendor_lookup-csv.csv', header=0)
In [8]:
db_pay.head()
Out[8]:
payment_type payment_lookup
0 Cas Cash
1 CAS Cash
2 Cre Credit
3 CRE Credit
4 No No Charge
In [9]:
db_vendor.head()
Out[9]:
vendor_id name address city state zip country contact current
0 CMT Creative Mobile Technologies, LLC 950 4th Road Suite 78 Brooklyn NY 11210 USA contactCMT@gmail.com Yes
1 VTS VeriFone Inc 26 Summit St. Flushing NY 11354 USA admin@vtstaxi.com Yes
2 DDS Dependable Driver Service, Inc 8554 North Homestead St. Bronx NY 10472 USA 9778896500 Yes
3 TS Total Solutions Co Five Boroughs Taxi Co. Brooklyn NY 11229 USA mgmt@5btc.com Yes
4 MT Mega Taxi 4 East Jennings St. Brooklyn NY 11228 USA contact@megataxico.com No
In [10]:
db_2009.columns
Out[10]:
Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude', 'rate_code',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'surcharge', 'tip_amount',
       'tolls_amount', 'total_amount'],
      dtype='object')
In [11]:
db_2010.columns
Out[11]:
Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude', 'rate_code',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'surcharge', 'tip_amount',
       'tolls_amount', 'total_amount'],
      dtype='object')
In [12]:
db_2011.columns
Out[12]:
Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude', 'rate_code',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'surcharge', 'tip_amount',
       'tolls_amount', 'total_amount'],
      dtype='object')
In [13]:
db_2012.columns
Out[13]:
Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude', 'rate_code',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'surcharge', 'tip_amount',
       'tolls_amount', 'total_amount'],
      dtype='object')
In [14]:
# Quais os 3 maiores ​ vendors ​ em quantidade total de dinheiro arrecadado;

print('2009: ', db_2009.groupby(['vendor_id']).sum()['total_amount'].sort_values(ascending=[False]).head(3))
print(' ')
print('2010: ', db_2010.groupby(['vendor_id']).sum()['total_amount'].sort_values(ascending=[False]).head(3))
print(' ')
print('2011: ', db_2011.groupby(['vendor_id']).sum()['total_amount'].sort_values(ascending=[False]).head(3))
print(' ')
print('2012: ',db_2012.groupby(['vendor_id']).sum()['total_amount'].sort_values(ascending=[False]).head(3))
2009:  vendor_id
CMT    4887271.07
VTS    4760858.50
DDS     678725.43
Name: total_amount, dtype: float64
 
2010:  vendor_id
CMT    4887271.07
VTS    4760858.50
DDS     678725.43
Name: total_amount, dtype: float64
 
2011:  vendor_id
CMT    4887271.07
VTS    4760858.50
DDS     678725.43
Name: total_amount, dtype: float64
 
2012:  vendor_id
CMT    4887271.07
VTS    4760858.50
DDS     678725.43
Name: total_amount, dtype: float64
In [15]:
db_2009 = pd.DataFrame(db_2009)
db_2010 = pd.DataFrame(db_2010)
db_2011 = pd.DataFrame(db_2011)
db_2012 = pd.DataFrame(db_2012)

db_complete = pd.concat([db_2009,db_2010,db_2011,db_2012])
In [16]:
print('2009 - 2012: ', db_complete.groupby(['vendor_id']).sum()['total_amount'].sort_values(ascending=[False]).head(3))
2009 - 2012:  vendor_id
CMT    1.954908e+07
VTS    1.904343e+07
DDS    2.714902e+06
Name: total_amount, dtype: float64
In [17]:
# Qual a distância média percorrida por viagens com no máximo 2 passageiros;

print('distancia média para 2 ou menos passageiros:')

print(db_complete['trip_distance'][db_complete['passenger_count'] <= 2].mean())
    
    
distancia média para 2 ou menos passageiros:
2.6625269962032174
In [18]:
# Faça um histograma da distribuição mensal, nos 4 anos, de corridas pagas em dinheiro;

db_complete.head()
Out[18]:
vendor_id pickup_datetime dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude rate_code store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount surcharge tip_amount tolls_amount total_amount
0 CMT 2009-04-21T18:51:11.767205+00:00 2009-04-21T18:57:09.433767+00:00 2 0.80 -74.004114 40.742950 NaN NaN -73.994712 40.747950 Cash 5.4 0.0 0.0 0.0 5.4
1 CMT 2009-01-13T07:40:07.639754+00:00 2009-01-13T07:50:36.386011+00:00 1 5.40 -73.996506 40.747784 NaN NaN -73.940449 40.792385 Cash 15.4 0.0 0.0 0.0 15.4
2 VTS 2009-01-06T19:30:44.142187+00:00 2009-01-06T19:43:09.908429+00:00 5 1.67 -73.951340 40.752070 NaN NaN -73.944535 40.768108 CASH 6.1 1.0 0.0 0.0 7.1
3 VTS 2009-09-24T08:28:09.004451+00:00 2009-09-24T08:41:56.739991+00:00 1 4.13 -74.001117 40.729128 NaN NaN -73.958122 40.721812 CASH 12.9 0.5 0.0 0.0 13.4
4 VTS 2009-10-29T05:50:39.214629+00:00 2009-10-29T05:57:22.880174+00:00 1 2.03 -73.976600 40.756873 NaN NaN -73.979757 40.776075 CASH 7.7 0.5 0.0 0.0 8.2
In [19]:
db_complete['payment_type'].unique()
Out[19]:
array(['Cash', 'CASH', 'Credit', 'CREDIT', 'No Charge', 'Dispute'],
      dtype=object)
In [20]:
db_2009['dropoff_datetime'] = pd.to_datetime(db_2009['dropoff_datetime'])
nov_mask_2009 = db_2009['dropoff_datetime'].map(lambda x: x.month)
db_2009['dropoff_months'] = nov_mask_2009 
In [21]:
x = db_2009['dropoff_months'][db_2009['payment_type'] == 'Cash']
y = db_2009['dropoff_months'][db_2009['payment_type'] == 'CASH']
filter_trip_2009 = pd.concat([x,y])
In [22]:
db_2010['dropoff_datetime'] = pd.to_datetime(db_2010['dropoff_datetime'])
nov_mask_2010 = db_2010['dropoff_datetime'].map(lambda x: x.month)
db_2010['dropoff_months'] = nov_mask_2010 

x = db_2010['dropoff_months'][db_2010['payment_type'] == 'Cash']
y = db_2010['dropoff_months'][db_2010['payment_type'] == 'CASH']
filter_trip_2010 = pd.concat([x,y])
In [23]:
db_2011['dropoff_datetime'] = pd.to_datetime(db_2011['dropoff_datetime'])
nov_mask_2011 = db_2011['dropoff_datetime'].map(lambda x: x.month)
db_2011['dropoff_months'] = nov_mask_2011

x = db_2011['dropoff_months'][db_2011['payment_type'] == 'Cash']
y = db_2011['dropoff_months'][db_2011['payment_type'] == 'CASH']
filter_trip_2011 = pd.concat([x,y])
In [24]:
db_2012['dropoff_datetime'] = pd.to_datetime(db_2012['dropoff_datetime'])
nov_mask_2012 = db_2012['dropoff_datetime'].map(lambda x: x.month)
db_2012['dropoff_months'] = nov_mask_2012 

x = db_2012['dropoff_months'][db_2012['payment_type'] == 'Cash']
y = db_2012['dropoff_months'][db_2012['payment_type'] == 'CASH']
filter_trip_2012 = pd.concat([x,y])
In [63]:
## Faça um histograma da distribuição mensal, nos 4 anos, de corridas pagas em dinheiro;
plt.figure(figsize=(20,10))
plt.hist(filter_trip_2009, color='g', alpha = 0.5, label=2009)
plt.hist(filter_trip_2010, color='r', alpha = 0.5, label=2010)
plt.hist(filter_trip_2011, color='b', alpha = 0.5, label=2011)
plt.hist(filter_trip_2012, color='y', alpha = 0.5, label=2012)
plt.xlabel('Months')
plt.ylabel('Number of trips')
plt.title('Monthly distribution of cash payed trips')
plt.legend()
plt.show()
In [46]:
db_2012['dropoff_datetime'] = pd.to_datetime(db_2012['dropoff_datetime'])
nov_days_2012 = db_2012['dropoff_datetime'].map(lambda x: x.month)
db_2012['dropoff_days'] = nov_mask_2012 
In [75]:
# separando os ultimos três meses

last_three_months = db_2012[db_2012['dropoff_months'] >= 8]
last_three_months['dropoff_months'].unique()
Out[75]:
array([ 8,  9, 10])
In [76]:
last_three_months
Out[76]:
vendor_id pickup_datetime dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude rate_code store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount surcharge tip_amount tolls_amount total_amount dropoff_months dropoff_days last_three_months
1 CMT 2012-08-14T21:44:14.940659+00:00 2012-08-14 21:54:50.018630+00:00 1 5.40 -73.996506 40.747784 NaN NaN -73.940449 40.792385 Cash 15.4 0.0 0.0 0.0 15.4 8 8 NaN
5 VTS 2012-09-18T08:04:16.536019+00:00 2012-09-18 08:11:57.381861+00:00 1 1.15 -73.994665 40.750528 NaN NaN -73.978623 40.750478 CASH 6.5 0.0 0.0 0.0 6.5 9 9 NaN
9 VTS 2012-10-17T11:28:59.571708+00:00 2012-10-17 11:43:49.819286+00:00 5 0.75 -74.001978 40.740683 NaN NaN -73.993722 40.747330 CASH 3.7 0.0 0.0 0.0 3.7 10 10 10.0
10 VTS 2012-09-19T11:33:49.445171+00:00 2012-09-19 11:42:46.213767+00:00 2 1.11 0.000000 0.000000 NaN NaN 0.000000 0.000000 CASH 5.7 1.0 0.0 0.0 6.7 9 9 NaN
11 VTS 2012-10-17T21:46:51.187166+00:00 2012-10-17 21:52:24.351695+00:00 1 4.02 -73.996177 40.686802 NaN NaN -73.990310 40.731687 Credit 11.7 0.5 2.0 0.0 14.2 10 10 10.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
999986 CMT 2012-09-07T21:24:33.681647+00:00 2012-09-07 21:30:38.864170+00:00 2 2.50 -73.950933 40.785989 NaN NaN -73.964355 40.756572 Cash 8.6 0.0 0.0 0.0 8.6 9 9 NaN
999990 CMT 2012-10-11T18:53:47.718562+00:00 2012-10-11 19:05:08.562426+00:00 3 1.00 -73.955900 40.772290 NaN NaN -73.946428 40.774825 Cash 5.8 0.0 0.0 0.0 5.8 10 10 10.0
999993 CMT 2012-10-24T02:13:06.454046+00:00 2012-10-24 02:22:27.586879+00:00 1 1.40 -73.986315 40.686709 NaN NaN -73.969958 40.688731 Cash 7.8 0.0 0.0 0.0 7.8 10 10 10.0
999997 CMT 2012-09-15T15:00:58.80813+00:00 2012-09-15 15:08:10.313917+00:00 4 2.90 -74.006156 40.743763 NaN NaN -73.974005 40.764069 Cash 10.6 0.0 0.0 0.0 10.6 9 9 NaN
999999 CMT 2012-08-24T07:25:13.503063+00:00 2012-08-24 07:38:33.870016+00:00 1 2.00 -73.988829 40.731162 NaN NaN -74.009790 40.710155 Cash 7.8 0.0 0.0 0.0 7.8 8 8 NaN

292797 rows × 20 columns

In [79]:
tip_last_90 = last_three_months['tip_amount'][last_three_months['dropoff_days'] != 'NaN']
tip_last_90
/home/filipe/Documents/env/lib/python3.7/site-packages/pandas/core/ops/__init__.py:1115: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  result = method(y)
Out[79]:
1         0.0
5         0.0
9         0.0
10        0.0
11        2.0
         ... 
999986    0.0
999990    0.0
999993    0.0
999997    0.0
999999    0.0
Name: tip_amount, Length: 292797, dtype: float64
In [81]:
# Faça um gráfico de série temporal contando a quantidade de gorjetas de cada dia, nos
# últimos 3 meses de 2012.
plt.figure(figsize=(20,10))
plt.plot(tip_last_90)
plt.ylabel('tip_amount')
plt.xlabel('days')
plt.title('Amount of tips last 3 months')
plt.show()
In [88]:
!pip install plotly==4.5.0
Collecting plotly==4.5.0
  Downloading https://files.pythonhosted.org/packages/06/e1/88762ade699460dc3229c890f9845d16484a40955a590b65052f0958613c/plotly-4.5.0-py2.py3-none-any.whl (7.1MB)
    100% |████████████████████████████████| 7.1MB 184kB/s eta 0:00:01
Requirement already satisfied: six in /home/filipe/Documents/env/lib/python3.7/site-packages (from plotly==4.5.0) (1.13.0)
Collecting retrying>=1.3.3 (from plotly==4.5.0)
  Downloading https://files.pythonhosted.org/packages/44/ef/beae4b4ef80902f22e3af073397f079c96969c69b2c7d52a57ea9ae61c9d/retrying-1.3.3.tar.gz
Building wheels for collected packages: retrying
  Running setup.py bdist_wheel for retrying ... error
  Complete output from command /home/filipe/Documents/env/bin/python3 -u -c "import setuptools, tokenize;__file__='/tmp/pip-install-hskyl05u/retrying/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" bdist_wheel -d /tmp/pip-wheel-yv1jqsgw --python-tag cp37:
  usage: -c [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...]
     or: -c --help [cmd1 cmd2 ...]
     or: -c --help-commands
     or: -c cmd --help
  
  error: invalid command 'bdist_wheel'
  
  ----------------------------------------
  Failed building wheel for retrying
  Running setup.py clean for retrying
Failed to build retrying
Installing collected packages: retrying, plotly
  Running setup.py install for retrying ... done
Successfully installed plotly-4.5.0 retrying-1.3.3
In [91]:
!pip install the chart-studio
Collecting the
  Downloading https://files.pythonhosted.org/packages/1e/39/f7d201c5afcccc4ed9654d0e47c57364a22fa5efd25fe9dd39d564541505/the-0.1.5.tar.gz
Collecting chart-studio
  Downloading https://files.pythonhosted.org/packages/b9/3f/d2f3f506ba1aaf109f549f8b01d1483cd3e324c5ebe6b206acee66efdf46/chart_studio-1.0.0-py3-none-any.whl (76kB)
    100% |████████████████████████████████| 81kB 1.0MB/s ta 0:00:01
Requirement already satisfied: retrying>=1.3.3 in /home/filipe/Documents/env/lib/python3.7/site-packages (from chart-studio) (1.3.3)
Collecting requests (from chart-studio)
  Downloading https://files.pythonhosted.org/packages/51/bd/23c926cd341ea6b7dd0b2a00aba99ae0f828be89d72b2190f27c11d4b7fb/requests-2.22.0-py2.py3-none-any.whl (57kB)
    100% |████████████████████████████████| 61kB 3.4MB/s eta 0:00:01
Requirement already satisfied: plotly in /home/filipe/Documents/env/lib/python3.7/site-packages (from chart-studio) (4.5.0)
Requirement already satisfied: six in /home/filipe/Documents/env/lib/python3.7/site-packages (from chart-studio) (1.13.0)
Collecting idna<2.9,>=2.5 (from requests->chart-studio)
  Downloading https://files.pythonhosted.org/packages/14/2c/cd551d81dbe15200be1cf41cd03869a46fe7226e7450af7a6545bfc474c9/idna-2.8-py2.py3-none-any.whl (58kB)
    100% |████████████████████████████████| 61kB 2.6MB/s ta 0:00:011
Collecting chardet<3.1.0,>=3.0.2 (from requests->chart-studio)
  Downloading https://files.pythonhosted.org/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl (133kB)
    100% |████████████████████████████████| 143kB 3.3MB/s ta 0:00:01
Collecting certifi>=2017.4.17 (from requests->chart-studio)
  Downloading https://files.pythonhosted.org/packages/b9/63/df50cac98ea0d5b006c55a399c3bf1db9da7b5a24de7890bc9cfd5dd9e99/certifi-2019.11.28-py2.py3-none-any.whl (156kB)
    100% |████████████████████████████████| 163kB 2.9MB/s eta 0:00:01
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 (from requests->chart-studio)
  Downloading https://files.pythonhosted.org/packages/e8/74/6e4f91745020f967d09332bb2b8b9b10090957334692eb88ea4afe91b77f/urllib3-1.25.8-py2.py3-none-any.whl (125kB)
    100% |████████████████████████████████| 133kB 3.4MB/s eta 0:00:01
Building wheels for collected packages: the
  Running setup.py bdist_wheel for the ... error
  Complete output from command /home/filipe/Documents/env/bin/python3 -u -c "import setuptools, tokenize;__file__='/tmp/pip-install-piy7l95v/the/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" bdist_wheel -d /tmp/pip-wheel-qviha1mo --python-tag cp37:
  usage: -c [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...]
     or: -c --help [cmd1 cmd2 ...]
     or: -c --help-commands
     or: -c cmd --help
  
  error: invalid command 'bdist_wheel'
  
  ----------------------------------------
  Failed building wheel for the
  Running setup.py clean for the
Failed to build the
Installing collected packages: the, idna, chardet, certifi, urllib3, requests, chart-studio
  Running setup.py install for the ... done
Successfully installed certifi-2019.11.28 chardet-3.0.4 chart-studio-1.0.0 idna-2.8 requests-2.22.0 the-0.1.5 urllib3-1.25.8
In [149]:
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode

init_notebook_mode(connected=True)

token = 'https://api.mapbox.com/styles/v1/filipescm/ck6gqq8bx3ljh1ip6xr1dwkkv.html?fresh=true&title=copy&access_token=pk.eyJ1IjoiZmlsaXBlc2NtIiwiYSI6ImNrNmdxbzRkNDBtdTUza210ZDdwMWFlczUifQ.u6sDxXHhFYrgvbEM0cHUJw'

fig = go.Figure(go.Scattermapbox(
        lat=db_2010['pickup_latitude'],
        lon=db_2010['pickup_longitude'],
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=14
        ),
        text=['Pickup Points'],
    ))

fig.update_layout(
    hovermode='closest',
    mapbox=go.layout.Mapbox(
        accesstoken=token,
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=40.783,
            lon=-73.966
        ),
        pitch=0,
        zoom=5
    )
)


fig.show()
In [ ]: